Task 1

a)

In this task, I am given accelerometer data for different gestures. It would not make much sense to visualize the raw acceleration values directly, so with the code below I calculated the displacement along each axis (by integrating the acceleration twice). The 3D plots are interactive; they can be rotated with the mouse.

require(plotly)

## Raw uWave accelerometer readings: one gesture instance per row,
## column 1 is the gesture class, the remaining columns are the
## acceleration time series.
trainx <- as.matrix(read.table("uWaveGestureLibrary_X_TRAIN"))
trainy <- as.matrix(read.table("uWaveGestureLibrary_Y_TRAIN"))
trainz <- as.matrix(read.table("uWaveGestureLibrary_Z_TRAIN"))

## Keep the class labels, then strip them from all three axes.
class <- trainx[, 1]
trainx <- trainx[, -1]
trainy <- trainy[, -1]
trainz <- trainz[, -1]

## Acceleration -> velocity -> displacement by cumulative summation along
## each row. apply() returns one column per input row, hence the transpose.
## Dimensions come from the data instead of being hard-coded (the original
## repeated the same 896x315 loop six times).
row_cumsum <- function(m) t(apply(m, 1, cumsum))

velx <- row_cumsum(trainx)
dispx <- row_cumsum(velx)

vely <- row_cumsum(trainy)
dispy <- row_cumsum(vely)

velz <- row_cumsum(trainz)
dispz <- row_cumsum(velz)

## 3-D displacement plot for one training example; the same call was
## previously copy-pasted eight times, so it is factored into a helper.
## The plotly formulas capture `i` from this function's environment,
## which persists after the call returns.
plot_gesture <- function(i) {
  plot_ly(x = ~dispx[i, ], y = ~dispy[i, ], z = ~dispz[i, ]) %>%
    add_markers()
}

##First gesture example
p1 <- plot_gesture(11)
p1
##Second gesture example
p2 <- plot_gesture(71)
p2
##Third gesture example
p3 <- plot_gesture(4)
p3
##Fourth gesture example
p4 <- plot_gesture(5)
p4
##Fifth gesture example
p5 <- plot_gesture(2)
p5
##Sixth gesture example
p6 <- plot_gesture(1)
p6
##Seventh gesture example
p7 <- plot_gesture(7)
p7
##Eighth gesture example
p8 <- plot_gesture(6)
p8

You can see from the plots that the displacement trajectories look similar to the gestures they represent.

b)

I tried two distance measures for the KNN algorithm: Manhattan distance and Euclidean distance. I wrote a KNN function called “knnfunc”. Its inputs are the training data, the test data, the k level and the distance measure.

## k-nearest-neighbour classifier.
##
## traindata, testdata: matrices whose first column is the class label and
##   whose remaining columns are the features.
## klev: number of neighbours k.
## dist_method: distance measure name passed to stats::dist
##   (e.g. "manhattan", "euclidean").
##
## Returns a vector of predicted classes, one per test row.
knnfunc <- function(traindata, testdata, klev, dist_method) {
  n_train <- nrow(traindata)
  n_test <- nrow(testdata)
  train_class <- traindata[, 1]

  ## Preallocate instead of growing with c() inside the loops
  ## (the original rebuilt both vectors on every iteration).
  pred <- numeric(n_test)

  for (t in seq_len(n_test)) {
    ## Distance from test instance t to every training instance.
    ## Computing pairwise distances row by row avoids building the full
    ## n x n distance matrix.
    d <- numeric(n_train)
    for (z in seq_len(n_train)) {
      d[z] <- dist(rbind(testdata[t, -1], traindata[z, -1]),
                   method = as.character(dist_method))
    }

    ## Classes of the klev nearest neighbours; order() is stable, so ties
    ## in distance resolve to the lower training index, as before.
    nn_class <- train_class[order(d)[seq_len(klev)]]

    ## Majority vote. unique() preserves distance order, and which.max()
    ## returns the first maximum, so vote ties go to the nearer class --
    ## identical to the original tabulate/match construction.
    uniqc <- unique(nn_class)
    pred[t] <- uniqc[which.max(tabulate(match(nn_class, uniqc)))]
  }
  pred
}

I combined the coordinate information into a single matrix called “train”. Then, with the code below, I applied 10-fold cross-validation for both distance measures. I used 1 repetition, since these calculations are time-consuming; you can modify the number of repetitions by changing “nofReplications”. I also normalized the data, since there may be differing accelerations.

## Combined training matrix: class label in column 1, then the x, y and z
## series side by side. NOTE: trainx/trainy/trainz already had their class
## column removed above, so the saved `class` vector is prepended here.
## (The original cbind(trainx, trainy[,-1], trainz[,-1]) left no class
## column at all and dropped one feature column each from y and z, which
## contradicts how knnfunc and the test-set construction use column 1.)
train <- cbind(class, trainx, trainy, trainz)
## Normalize the features; differing gesture speeds give differing scales.
train <- cbind(train[, 1], scale(train[, -1]))

require(TunePareto)
k_levels <- 1:10
nofReplications <- 1   # increase for more CV repetitions (slow)
nFolds <- 10
indices <- generateCVRuns(class, nofReplications, nFolds, stratified = TRUE)

## Accumulate per-fold results in a list and bind once at the end;
## rbind-ing a data.table inside a triple loop regrows it on every pass.
pieces <- list()
for (i in seq_len(nofReplications)) {
  thisReplication <- indices[[i]]
  for (j in seq_len(nFolds)) {
    testindices <- thisReplication[[j]]
    cvtrain <- train[-testindices, ]
    cvtest <- train[testindices, ]
    for (param_k in k_levels) {
      ## write "euclidean" here to change the distance measure
      pred1 <- knnfunc(cvtrain, cvtest, param_k, "manhattan")
      pieces[[length(pieces) + 1]] <- data.table(
        Replication = i, Fold = j, Klev = param_k, TestId = testindices,
        Predictions = pred1, Real = cvtest[, 1]
      )
    }
  }
}
cvresult <- rbindlist(pieces)

## Keep one copy of the CV results per distance measure.
## NOTE(review): `euclideanres` is only meaningful after re-running the CV
## loop above with "euclidean"; as written, both names point to the same
## object from the most recent run.
manhattanres<-cvresult
euclideanres<-cvresult #after running the algorithm with euclidean distance measure
## Mean accuracy per k, pooled over all folds (data.table aggregation).
manacc<-manhattanres[,list(Accu=mean(Predictions==Real)),by=list(Klev)]
eucacc<-euclideanres[,list(Accu=mean(Predictions==Real)),by=list(Klev)]
manacc
##     Klev      Accu
##  1:    1 0.9564732
##  2:    2 0.9564732
##  3:    3 0.9587054
##  4:    4 0.9575893
##  5:    5 0.9508929
##  6:    6 0.9531250
##  7:    7 0.9453125
##  8:    8 0.9497768
##  9:    9 0.9419643
## 10:   10 0.9430804
eucacc
##     Klev      Accu
##  1:    1 0.9441964
##  2:    2 0.9441964
##  3:    3 0.9441964
##  4:    4 0.9497768
##  5:    5 0.9408482
##  6:    6 0.9430804
##  7:    7 0.9375000
##  8:    8 0.9386161
##  9:    9 0.9341518
## 10:   10 0.9308036

The accuracy tables show that the accuracy-maximizing k is 3 for Manhattan distance and 4 for Euclidean distance. Now I can use this information for the test data.

c)

Before moving on, I normalized the test data. Then, I applied 3-NN with Manhattan distance and 4-NN with Euclidean distance.

## Test matrices: column 1 of testx still carries the class label, so the
## combined `test` matrix has the class in column 1 and only the y/z class
## columns are dropped.
testx<-as.matrix(read.table("uWaveGestureLibrary_X_TEST"))
testy<-as.matrix(read.table("uWaveGestureLibrary_Y_TEST"))
testz<-as.matrix(read.table("uWaveGestureLibrary_Z_TEST"))
test<-cbind(testx,testy[,-1],testz[,-1])
## NOTE(review): the test features are standardized with the test set's own
## mean/sd rather than the training statistics -- confirm this is intended.
test<-cbind(test[,1],scale(test[,-1]))
pred2<-c()
## k = 3: best Manhattan k from the cross-validation above
pred2<-knnfunc(train,test,3,"manhattan") 
testresultman=cbind(Predictions=pred2,Real=test[,1])
pred2<-c()
## k = 4: best Euclidean k from the cross-validation above
pred2<-knnfunc(train,test,4,"euclidean") 
testresulteuc=cbind(Predictions=pred2,Real=test[,1])

The accuracy, run time and confusion matrices can be found below.

## Test-set accuracy, Manhattan 3-NN
mean(testresultman[,1]==testresultman[,2])
## [1] 0.9508654
## Confusion matrix (rows = actual class, columns = predicted class)
table(testresultman[,2],testresultman[,1])
##    
##       1   2   3   4   5   6   7   8
##   1 432   0   0   2   0   3   0   0
##   2   1 451   0   0   0   0   0   0
##   3   2   0 415   0  12  20   5   0
##   4   3   0   0 384  48   8   0   7
##   5   3   0   4   2 422   2   0   0
##   6   6   0   4  12  27 400   0   0
##   7   1   0   2   0   0   0 444   0
##   8   0   0   0   1   1   0   0 458
## NOTE(review): runtime_man / runtime_euc are never assigned in this
## script -- presumably captured with system.time() in the original
## session; verify before re-running.
runtime_man
##    user  system elapsed 
##  240.89    0.51  243.32
## Test-set accuracy, Euclidean 4-NN
mean(testresulteuc[,1]==testresulteuc[,2])
## [1] 0.945282
## Confusion matrix (rows = actual class, columns = predicted class)
table(testresulteuc[,2],testresulteuc[,1])
##    
##       1   2   3   4   5   6   7   8
##   1 431   0   0   2   0   4   0   0
##   2   1 449   0   0   0   0   2   0
##   3   1   0 416   0  15  16   6   0
##   4   5   0   0 372  60   7   0   6
##   5   3   0   7   2 419   2   0   0
##   6   3   0   3  15  29 398   1   0
##   7   0   0   3   0   0   0 444   0
##   8   0   0   0   2   1   0   0 457
runtime_euc
##    user  system elapsed 
##  236.62    0.16  238.02

The accuracy for the Manhattan distance is 95% and for the Euclidean distance 94%, which are quite high. The confusion matrices indicate a problem with classifying gesture 4 as gesture 5. Runtimes for the algorithms are around 4 minutes. This could be reduced, as there are efficient KNN packages using Euclidean distance; however, I could not find one supporting other distance measures, so I wrote my own algorithm.

Task 2

a)

This time we are given ECG readings. I used the “penalized” package. I found the optimal L1 and L2 penalties by using 10-fold cross-validation; the package has functions called optL1 and optL2 that can be used for this purpose. I did not feel the need to scale the data, since these are all ECG readings from humans, so there should not be much difference in scale. I also recoded the “-1” class labels as “0”, and used 0.5 as the decision threshold.

require(penalized)

## ECG data: column 1 is the class label (-1/1 in the raw files),
## the remaining columns are the time-series readings.
ecg_train <- data.frame(read.table("ecgTRAIN"))
ecg_test <- data.frame(read.table("ecgTEST"))
## Recode -1 as 0 so the logistic model sees {0, 1} responses.
ecg_train[, 1][ecg_train[, 1] == -1] <- 0
ecg_test[, 1][ecg_test[, 1] == -1] <- 0

## Tune the penalties by 10-fold CV: lambda1 first, then lambda2 given it.
## (Renamed from b/c/e -- `c` in particular shadowed base::c().)
opt_l1 <- optL1(response = ecg_train[, 1], fusedl = TRUE,
                penalized = ecg_train[, -1], model = "logistic", fold = 10)
opt_l2 <- optL2(response = ecg_train[, 1], fusedl = TRUE,
                penalized = ecg_train[, -1], model = "logistic", fold = 10,
                lambda1 = opt_l1$lambda)
## Final fused-lasso logistic model with the tuned penalties.
model <- penalized(response = ecg_train[, 1], fusedl = TRUE,
                   penalized = ecg_train[, -1], model = "logistic",
                   data = ecg_train, lambda1 = opt_l1$lambda,
                   lambda2 = opt_l2$lambda)

## Predicted probabilities on the test set, thresholded at 0.5.
pred_prob <- predict(model, penalized = ecg_test[, -1], data = ecg_test)
pred_prob[pred_prob > 0.5] <- 1
pred_prob[pred_prob <= 0.5] <- 0

resultss <- data.frame(ecg_test[, 1], as.vector(pred_prob))

## Accuracy
mean(resultss[, 1] == resultss[, 2])
## [1] 0.82
## Confusion matrix (rows = actual class, columns = predicted class)
table(resultss[, 1], resultss[, 2])
##    
##      0  1
##   0 25 11
##   1  7 57

With this model, we have an accuracy of 82%. Treating “1” as the positive class, we have a high number of false positives and fewer false negatives. The test data also contain considerably more positives than negatives overall. Perhaps the cross-validation folds need to be stratified.

b)

I drew a plot with one time series from each class and the model coefficients.

## Coefficient profile of the fitted model over the time axis.
plot(coefficients(model, "all"), type = "l")

## Overlay one series from each class with the fitted coefficients.
series_class0 <- as.numeric(t(ecg_train[1, -1]))
series_class1 <- as.numeric(t(ecg_train[2, -1]))
coefs <- coefficients(model, "all")
plot_ly(y = ~series_class0, mode = "lines", type = "scatter",
        name = "ECG with 0 result") %>%
  add_trace(y = ~series_class1, mode = "lines", type = "scatter",
            name = "ECG with 1 Result") %>%
  add_trace(y = ~coefs, mode = "lines", type = "scatter",
            name = "Coefficients") %>%
  layout(yaxis = list(title = "ECG Result"), xaxis = list(title = "Time"))

Based on the plot, the coefficients seem to correspond to the times where changes happen, and they seem to indicate the direction of the change. The graph shows one class-0 series, one class-1 series and the coefficients; the coefficients appear to capture the changes in both classes.

c)

Now, based on the findings in part b, I calculated the differences between consecutive time-series observations and built a model on them.

## First differences of consecutive readings, keeping the class label in
## column 1. Output column i (i >= 2) holds ecg[, i+1] - ecg[, i], exactly
## as the original loops computed. The 100x96 dimensions are now derived
## from the data, and the same code serves both train and test sets.
make_diff <- function(ecg) {
  p <- ncol(ecg)
  out <- matrix(0, nrow = nrow(ecg), ncol = p - 1)
  out[, 1] <- ecg[, 1]
  out[, -1] <- as.matrix(ecg[, 3:p] - ecg[, 2:(p - 1)])
  out
}

diff_train <- data.frame(make_diff(ecg_train))
diff_test <- data.frame(make_diff(ecg_test))

## Tune penalties on the differenced data (10-fold CV), then fit.
k <- optL1(response = diff_train[, 1], fusedl = TRUE,
           penalized = diff_train[, -1], data = diff_train,
           model = "logistic", fold = 10)
m <- optL2(response = diff_train[, 1], fusedl = TRUE,
           penalized = diff_train[, -1], model = "logistic",
           data = diff_train, fold = 10, lambda1 = k$lambda,
           minlambda2 = 0.2)
model2 <- penalized(response = diff_train[, 1], fusedl = TRUE,
                    penalized = diff_train[, -1], model = "logistic",
                    data = diff_train, lambda1 = k$lambda,
                    lambda2 = m$lambda)

## Predicted probabilities, thresholded at 0.5.
v <- predict(model2, penalized = diff_test[, -1], data = diff_test)
v[v > 0.5] <- 1
v[v <= 0.5] <- 0

resultss2 <- data.frame(diff_test[, 1], as.vector(v))

## Accuracy
mean(resultss2[, 1] == resultss2[, 2])
## [1] 0.85
## Confusion matrix (rows = actual class, columns = predicted class)
table(resultss2[, 1], resultss2[, 2])
##    
##      0  1
##   0 27  9
##   1  6 58

With this model, the accuracy has increased and the numbers of false positives and false negatives have both fallen.

d)

## Coefficient profile of the difference-based model.
plot(coefficients(model2, "all"), type = "l")

## Overlay a differenced series from each class with the coefficients.
diff_class0 <- as.numeric(diff_train[1, -1])
diff_class1 <- as.numeric(diff_train[5, -1])
coefs2 <- coefficients(model2, "all")
plot_ly(y = ~diff_class0, mode = "lines", type = "scatter",
        name = "Difference with 0") %>%
  add_trace(y = ~coefs2, mode = "lines", type = "scatter",
            name = "Model Coefficients") %>%
  add_trace(y = ~diff_class1, mode = "lines", type = "scatter",
            name = "Difference with 1") %>%
  layout(yaxis = list(title = "ECG Result"), xaxis = list(title = "Time"))

The differences change more in class 1. Again, the coefficients try to capture the change, and this time they capture the movements better. The fused-lasso (difference) penalty gave smooth coefficients (not changing rapidly), the L1 penalty eliminated the less useful ones, and the ridge (L2) penalty shrank the remainder. The coefficients do capture the big changes in the data.